UHF_zipcode =
read_csv("./data/appb.csv") %>%
slice(-43) %>%
select(-Borough) %>%
rename("UHF" = "UHF Neighborhood") %>%
janitor::clean_names()
## Parsed with column specification:
## cols(
## Borough = col_character(),
## `UHF Neighborhood` = col_character(),
## `Zip Code` = col_character()
## )
raw_hiv =
GET("https://data.cityofnewyork.us/api/views/fju2-rdad/rows.csv") %>%
content("parsed") %>%
janitor::clean_names()
## Parsed with column specification:
## cols(
## Year = col_integer(),
## Borough = col_character(),
## UHF = col_character(),
## Gender = col_character(),
## Age = col_character(),
## Race = col_character(),
## `HIV diagnoses` = col_integer(),
## `HIV diagnosis rate` = col_double(),
## `Concurrent diagnoses` = col_integer(),
## `% linked to care within 3 months` = col_integer(),
## `AIDS diagnoses` = col_integer(),
## `AIDS diagnosis rate` = col_double(),
## `PLWDHI prevalence` = col_double(),
## `% viral suppression` = col_integer(),
## Deaths = col_integer(),
## `Death rate` = col_double(),
## `HIV-related death rate` = col_double(),
## `Non-HIV-related death rate` = col_double()
## )
combine_hiv =
right_join(UHF_zipcode, raw_hiv, by = "uhf") %>%
janitor::clean_names() %>%
separate(zip_code, into = c("zipcode1", "zipcode2", "zipcode3",
"zipcode4", "zipcode5", "zipcode6", "zipcode7", "zipcode8",
"zipcode9"), sep = ", ") %>%
gather(key = zip_code, value = zipcode_value, zipcode1:zipcode9) %>%
filter(!is.na(zipcode_value)) %>%
rename("zipcode" = "zipcode_value") %>%
select(zipcode, everything(), -zip_code)
r = GET('http://data.beta.nyc//dataset/3bf5fb73-edb5-4b05-bb29-7c95f4a727fc/resource/6df127b1-6d04-4bb7-b983-07402a2c3f90/download/f4129d9aa6dd4281bc98d0f701629b76nyczipcodetabulationareas.geojson')
nyc_zipcode = readOGR(content(r,'text'), 'OGRGeoJSON', verbose = F)
## No encoding supplied: defaulting to UTF-8.
zipcode_lat_lng = nyc_zipcode@data %>%
select(zipcode = postalCode, longitude, latitude) %>%
mutate(zipcode = as.character(zipcode))
combine_hiv1 =
full_join(zipcode_lat_lng, combine_hiv, by = "zipcode") %>%
mutate(longitude = as.numeric(longitude), latitude = as.numeric(latitude)) %>%
group_by(uhf) %>%
summarise(lng = mean(longitude),
lat = mean(latitude)) %>%
filter(!(uhf == "Pelhem - Throgs Neck"))
pums_raw = read_csv("./data/selected_pums.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## X1 = col_integer(),
## PUMA00 = col_integer(),
## PUMA10 = col_integer(),
## ADJINC = col_integer(),
## PINCP = col_integer(),
## RAC3P05 = col_integer(),
## RAC3P12 = col_integer()
## )
temp = tempfile(fileext = ".xls")
dataURL = "http://faculty.baruch.cuny.edu/geoportal/resources/nyc_geog/nyc_zcta10_to_puma10.xls"
download.file(dataURL, destfile = temp, mode = 'wb')
zcta_to_puma = readxl::read_xls(temp, sheet = 2) %>%
select(zcta = zcta10, puma = puma10) %>%
mutate(puma = as.numeric(puma))
dataURL =
"http://faculty.baruch.cuny.edu/geoportal/resources/nyc_geog/zip_to_zcta10_nyc_revised.xls"
download.file(dataURL, destfile = temp, mode = 'wb')
zip_to_zcta = readxl::read_xls(temp, sheet = 2) %>%
select(zipcode, zcta = zcta5)
pums_data =
pums_raw %>%
select(puma = PUMA10, income = PINCP, year = ADJINC) %>%
filter(puma != -9) # remove data from 2011 due to lack of area information
pums_data$year = recode(pums_data$year,
"1042852" = "2012",
"1025215" = "2013",
"1009585" = "2014",
"1001264" = "2015")
pums_data =
pums_data %>%
group_by(year, puma) %>%
summarise(mid_income = median(income, na.rm = TRUE)) %>%
ungroup() # calculate yearly median income for each area
puma_to_zipcode = right_join(zip_to_zcta, zcta_to_puma, by = "zcta") %>% # generaate a puma to zipcode file
select(puma, zipcode)
income_zipcode = right_join(pums_data, puma_to_zipcode, by = "puma") %>% # matching zipcode with median income data
select(year, zipcode, mid_income) %>%
mutate(year = as.numeric(year))
combine_hiv_income =
left_join(combine_hiv, income_zipcode, by = c("year", "zipcode"))
First, import and tidy data:
gender neighborhood VS hiv
neb_plot = hiv_data %>%
group_by(neighborhood, gender) %>%
filter(year != "ALL", borough != "All", neighborhood != "All", gender != "All") %>%
filter(age != "All") %>%
summarise(sum_hiv = sum(hiv_diagnoses)) %>%
ggplot(aes(x = reorder(neighborhood, sum_hiv), y = sum_hiv, color = gender)) +
coord_flip() +
geom_point() +
labs(
title = "Gender and Neighborhood Influence on HIV Incidence",
x = "Neighborhood",
y = "HIV diagnoses",
caption = "Data from the ..."
)
ggplotly(neb_plot)
The number of HIV diagnoses is higher among male than female in all neighborhoods. Beford Stuyvesant - Crown Heights have the most HIV diagnoses for both men and women.
age_plot = hiv_data %>%
filter(race == "All" & borough == "All" & age != "All") %>%
group_by(gender, age) %>%
summarise(sum_hiv = sum(hiv_diagnoses)) %>%
ggplot(aes(y = sum_hiv, x = age, fill = gender)) +
geom_bar(stat = "identity", alpha = 0.8, position = position_dodge()) +
scale_fill_brewer(palette = "Dark2") +
labs(
title = "Gender and Age Influence on HIV Incidence",
x = "Age range",
y = "HIV diagnoses",
caption = "Data from the ..."
)
ggplotly(age_plot)
race_plot = hiv_data %>%
filter(age == "All" & borough == "All" & race != "All") %>%
group_by(gender, race) %>%
summarise(sum_hiv = sum(hiv_diagnoses)) %>%
ggplot(aes(y = sum_hiv, x = reorder(race, sum_hiv), fill = gender)) +
geom_bar(stat = "identity", alpha = 0.8, position = position_dodge()) +
scale_fill_manual(values = c("#E69F00", "#56B4E9")) +
labs(
title = "Race and Gender Influence on HIV Incidence",
x = "Race",
y = "HIV diagnoses",
caption = "Data from the ..."
)
ggplotly(race_plot)
hiv diagnoses in borough with most hiv over years
hiv_data %>%
filter(borough != "All", neighborhood == "All", gender == "All", age == "All", race == "All") %>%
group_by(borough) %>%
summarize(sum_hiv = sum(hiv_diagnoses)) %>%
arrange(desc(sum_hiv))
## # A tibble: 5 x 2
## borough sum_hiv
## <chr> <int>
## 1 Brooklyn 3815
## 2 Manhattan 3536
## 3 Bronx 2736
## 4 Queens 2327
## 5 Staten Island 217
year_plot = hiv_data %>%
mutate(year = as.integer(year)) %>%
filter(borough == "Brooklyn" & gender == "Male" & age == "20 - 29") %>%
group_by(year, neighborhood) %>%
summarize(sum_hiv = sum(hiv_diagnoses)) %>%
ggplot(aes(x = year, y = sum_hiv, color = neighborhood)) +
geom_line()
ggplotly(year_plot)
income_plot = combine_hiv_income %>%
filter(year != 2011, gender == "All", age == "All", race == "All") %>%
group_by(uhf) %>%
summarise(sum_hiv = sum(hiv_diagnoses), mean_income = mean(mid_income)) %>%
ggplot(aes(x = mean_income, y = sum_hiv)) +
geom_point() +
labs(
title = "Income Influence on HIV Incidence",
x = "Average income of each neighborhood",
y = "HIV diagnoses",
caption = "Data from the ..."
)
ggplotly(income_plot)
Limitations:
We can not visualize the effect of age and race at the same time.
income_hiv %>%
filter(year != "2011" & age != "All") %>%
lm(hiv_diagnoses ~ borough + gender + age + mid_income, data = .) %>%
summary() %>%
broom::tidy() %>%
knitr::kable(digits = 3)
| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| (Intercept) | 0.983 | 0.302 | 3.252 | 0.001 |
| boroughBrooklyn | 0.297 | 0.281 | 1.060 | 0.289 |
| boroughManhattan | 3.091 | 0.331 | 9.332 | 0.000 |
| boroughQueens | -1.245 | 0.259 | -4.811 | 0.000 |
| boroughStaten Island | -4.376 | 0.397 | -11.016 | 0.000 |
| genderMale | 6.083 | 0.152 | 40.138 | 0.000 |
| age20 - 29 | 9.600 | 0.262 | 36.576 | 0.000 |
| age30 - 39 | 6.870 | 0.262 | 26.175 | 0.000 |
| age40 - 49 | 4.627 | 0.262 | 17.627 | 0.000 |
| age50 - 59 | 2.355 | 0.262 | 8.972 | 0.000 |
| age60+ | 0.427 | 0.262 | 1.626 | 0.104 |
| mid_income | 0.000 | 0.000 | -17.851 | 0.000 |
income_hiv %>%
filter(year != "2011" & race != "All") %>%
lm(hiv_diagnoses ~ borough + gender + race + mid_income, data = .) %>%
summary() %>%
broom::tidy() %>%
knitr::kable(digits = 3)
| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| (Intercept) | 1.514 | 0.514 | 2.945 | 0.003 |
| boroughBrooklyn | 0.357 | 0.493 | 0.724 | 0.469 |
| boroughManhattan | 3.710 | 0.582 | 6.376 | 0.000 |
| boroughQueens | -1.494 | 0.454 | -3.287 | 0.001 |
| boroughStaten Island | -5.251 | 0.698 | -7.527 | 0.000 |
| genderMale | 7.299 | 0.266 | 27.425 | 0.000 |
| raceBlack | 10.932 | 0.421 | 25.978 | 0.000 |
| raceLatino/Hispanic | 9.027 | 0.421 | 21.451 | 0.000 |
| raceOther/Unknown | -1.380 | 0.421 | -3.278 | 0.001 |
| raceWhite | 3.628 | 0.421 | 8.621 | 0.000 |
| mid_income | 0.000 | 0.000 | -12.197 | 0.000 |
income_plot = income_hiv %>%
filter(year != "2011") %>%
group_by(uhf, year) %>%
summarise(sum_hiv = mean(hiv_diagnoses), mid_in = median(mid_income)) %>%
ggplot(aes(x = mid_in, y = sum_hiv, color = year)) +
geom_point() +
geom_smooth(method = lm) +
theme_bw() +
theme(legend.position = "None")
ggplotly(income_plot)
Income distribution in different neighborhood
income_dist = income_hiv %>%
ggplot(aes(y = mid_income, x = uhf)) +
geom_point(alpha = 0.1) +
coord_flip() +
theme_bw()
ggplotly(income_dist)
income_hiv %>%
filter(year != "2011" & age != "All") %>%
lm(hiv_diagnosis_rate ~ borough + gender + age + mid_income, data = .) %>%
summary() %>%
broom::tidy() %>%
knitr::kable(digits = 3)
| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| (Intercept) | 17.851 | 1.511 | 11.811 | 0.000 |
| boroughBrooklyn | -12.078 | 1.403 | -8.611 | 0.000 |
| boroughManhattan | 15.424 | 1.656 | 9.316 | 0.000 |
| boroughQueens | -22.620 | 1.293 | -17.492 | 0.000 |
| boroughStaten Island | -30.524 | 1.985 | -15.377 | 0.000 |
| genderMale | 39.822 | 0.757 | 52.582 | 0.000 |
| age20 - 29 | 44.060 | 1.312 | 33.589 | 0.000 |
| age30 - 39 | 33.215 | 1.312 | 25.322 | 0.000 |
| age40 - 49 | 27.986 | 1.312 | 21.336 | 0.000 |
| age50 - 59 | 13.841 | 1.312 | 10.552 | 0.000 |
| age60+ | -4.261 | 1.312 | -3.249 | 0.001 |
| mid_income | -0.001 | 0.000 | -18.326 | 0.000 |
income_hiv %>%
filter(year != "2011" & race != "All") %>%
lm(hiv_diagnosis_rate ~ borough + gender + race + mid_income, data = .) %>%
summary() %>%
broom::tidy() %>%
knitr::kable(digits = 3)
| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| (Intercept) | 0.795 | 2.459 | 0.323 | 0.747 |
| boroughBrooklyn | -11.951 | 2.357 | -5.070 | 0.000 |
| boroughManhattan | 22.135 | 2.783 | 7.955 | 0.000 |
| boroughQueens | -25.218 | 2.173 | -11.603 | 0.000 |
| boroughStaten Island | -31.251 | 3.336 | -9.367 | 0.000 |
| genderMale | 49.480 | 1.273 | 38.875 | 0.000 |
| raceBlack | 61.810 | 2.012 | 30.713 | 0.000 |
| raceLatino/Hispanic | 34.404 | 2.012 | 17.095 | 0.000 |
| raceOther/Unknown | 9.809 | 2.012 | 4.874 | 0.000 |
| raceWhite | 12.984 | 2.012 | 6.452 | 0.000 |
| mid_income | 0.000 | 0.000 | -1.729 | 0.084 |
income_plot_diag_rate = income_hiv %>%
filter(year != "2011") %>%
group_by(uhf, year) %>%
summarise(sum_hiv_diagnosis_rate = sum(hiv_diagnosis_rate), mid_in = median(mid_income)) %>%
ggplot(aes(x = mid_in, y = sum_hiv_diagnosis_rate, color = year)) +
geom_point() +
geom_smooth(method = lm) +
theme_bw() +
theme(legend.position = "None")
ggplotly(income_plot_diag_rate)
diagnoses_by_zipcode =
combine_hiv_income %>%
filter(gender == "All", age == "All", race == "All") %>%
mutate(zipcode = factor(zipcode)) %>%
group_by(zipcode, uhf) %>%
summarise(sum_diagnoses = sum(hiv_diagnoses), sum_diagnosis_rate = sum(hiv_diagnosis_rate))
map_data = geo_join(nyc_zipcode, diagnoses_by_zipcode, "postalCode", "zipcode")
points_spdf = combine_hiv1 %>%
filter(!is.na(lng))
coordinates(points_spdf) = ~lng + lat
proj4string(points_spdf) = proj4string(nyc_zipcode)
matches = over(points_spdf, nyc_zipcode)
points = full_join(matches, rename(diagnoses_by_zipcode, postalCode = zipcode), by = "postalCode")
## Warning: Column `postalCode` joining factors with different levels,
## coercing to character vector
pal1 = colorNumeric(palette = "Reds", domain = range(map_data@data$sum_diagnoses, na.rm = T))
# "BuPu" "viridis" "Greens""inferno"
leaflet(map_data) %>%
addTiles() %>%
addPolygons(color = "black", weight = 1,
fillColor = ~pal1(sum_diagnoses), fillOpacity = 0.8,
popup = ~stringr::str_c(uhf, " sum:", factor(sum_diagnoses))) %>%
addMarkers(~longitude, ~latitude,
popup = ~stringr::str_c(uhf, " sum:", factor(sum_diagnoses)), data = points) %>%
addProviderTiles("CartoDB.Positron") %>%
addLegend(position = "bottomright", pal = pal1, values = ~sum_diagnoses,
title = "The amount of HIV diagnoses", opacity = 0.8) %>%
setView(-73.98, 40.75, zoom = 11)
## Warning in validateCoords(lng, lat, funcName): Data contains 127 rows with
## either missing or invalid lat/lon values and will be ignored
pal2 = colorNumeric(palette = "Reds", domain = range(map_data@data$sum_diagnosis_rate, na.rm = T))
# "BuPu" "viridis" "Greens""inferno"
leaflet(map_data) %>%
addTiles() %>%
addPolygons(color = "black", weight = 1,
fillColor = ~pal1(sum_diagnosis_rate), fillOpacity = 0.8,
popup = ~stringr::str_c(uhf, " sum:", factor(sum_diagnosis_rate))) %>%
addMarkers(~longitude, ~latitude,
popup = ~stringr::str_c(uhf, " sum:", factor(sum_diagnosis_rate)), data = points) %>%
addProviderTiles("CartoDB.Positron") %>%
addLegend(position = "bottomright", pal = pal2, values = ~sum_diagnosis_rate,
title = "The amount of HIV diagnosis rate", opacity = 0.8) %>%
setView(-73.98, 40.75, zoom = 11)
## Warning in validateCoords(lng, lat, funcName): Data contains 127 rows with
## either missing or invalid lat/lon values and will be ignored
income_by_zipcode =
combine_hiv_income %>%
filter(year != "2011") %>%
filter(gender == "All", age == "All", race == "All") %>%
group_by(zipcode, uhf) %>%
summarise(mean_income = mean(mid_income))
map_data_income = geo_join(nyc_zipcode, income_by_zipcode, "postalCode", "zipcode")
pal3 = colorNumeric(palette = "Greens", domain = range(map_data_income@data$mean_income, na.rm = T))
leaflet(map_data_income) %>%
addTiles() %>%
addPolygons(color = "black", weight = 1,
fillColor = ~pal3(mean_income), fillOpacity = 0.8,
popup = ~stringr::str_c(uhf, " sum:", factor(mean_income))) %>%
addProviderTiles("CartoDB.Positron") %>%
addLegend(position = "bottomright", pal = pal3, values = ~mean_income,
title = "Mean income of each uhf", opacity = 0.8) %>%
setView(-73.98, 40.75, zoom = 11)